// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Derrick Oswald
//
// Revision Control Information
//
// $URL: https://svn.sourceforge.net/svnroot/htmlparser/trunk/parser/src/main/java/org/htmlparser/sax/XMLReader.java $
// $Author: derrickoswald $
// $Date: 2006-09-16 10:44:17 -0400 (Sat, 16 Sep 2006) $
// $Revision: 4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
//   http://opensource.org/licenses/cpl1.0.php

package org.htmlparser.sax;

import java.io.IOException;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;

import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.NamespaceSupport;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;

/**
 * SAX parser. Generates callbacks on the {@link ContentHandler} based on
 * encountered nodes. <br>
 * <em>Preliminary</em>.
 * 
 * <pre>
 * org.xml.sax.XMLReader reader = org.xml.sax.helpers.XMLReaderFactory.createXMLReader(&quot;org.htmlparser.sax.XMLReader&quot;);
 * org.xml.sax.ContentHandler content = new MyContentHandler();
 * reader.setContentHandler(content);
 * org.xml.sax.ErrorHandler errors = new MyErrorHandler();
 * reader.setErrorHandler(errors);
 * reader.parse(&quot;http://cbc.ca&quot;);
 * </pre>
 */
public class XMLReader implements org.xml.sax.XMLReader {
	/**
	 * Determines if namespace handling is on. All XMLReaders are required to
	 * recognize the feature names:
	 * <ul>
	 * <li><code>http://xml.org/sax/features/namespaces</code> - a value of
	 * "true" indicates namespace URIs and unprefixed local names for element
	 * and attribute names will be available</li>
	 * <li><code>http://xml.org/sax/features/namespace-prefixes</code> - a
	 * value of "true" indicates that XML qualified names (with prefixes) and
	 * attributes (including xmlns* attributes) will be available.
	 * </ul>
	 */
	protected boolean mNameSpaces; // namespaces

	/**
	 * Determines if namespace prefix handling is on.
	 * 
	 * @see #mNameSpaces
	 */
	protected boolean mNameSpacePrefixes; // namespace-prefixes

	/**
	 * <em> not implemented</em>
	 */
	protected EntityResolver mEntityResolver;

	/**
	 * <em> not implemented</em>
	 */
	protected DTDHandler mDTDHandler;

	/**
	 * The content callback object.
	 */
	protected ContentHandler mContentHandler;

	/**
	 * The error handler object.
	 */
	protected ErrorHandler mErrorHandler;

	/**
	 * The underlying DOM parser.
	 */
	protected Parser mParser;

	/**
	 * Namspace utility object.
	 */
	protected NamespaceSupport mSupport;

	/**
	 * Qualified name parts.
	 */
	protected String mParts[];

	/**
	 * Create an SAX parser.
	 */
	public XMLReader() {
		mNameSpaces = true;
		mNameSpacePrefixes = false;

		mEntityResolver = null;
		mDTDHandler = null;
		mContentHandler = null;
		mErrorHandler = null;

		mSupport = new NamespaceSupport();
		mSupport.pushContext();
		mSupport.declarePrefix("", "http://www.w3.org/TR/REC-html40");
		// todo:
		// xmlns:html='http://www.w3.org/TR/REC-html40'
		// or xmlns:html='http://www.w3.org/1999/xhtml'
		mParts = new String[3];
	}

	// //////////////////////////////////////////////////////////////////
	// Configuration.
	// //////////////////////////////////////////////////////////////////

	/**
	 * Look up the value of a feature flag.
	 * 
	 * <p>
	 * The feature name is any fully-qualified URI. It is possible for an
	 * XMLReader to recognize a feature name but temporarily be unable to return
	 * its value. Some feature values may be available only in specific
	 * contexts, such as before, during, or after a parse. Also, some feature
	 * values may not be programmatically accessible. (In the case of an adapter
	 * for SAX1 {@link Parser}, there is no implementation-independent way to
	 * expose whether the underlying parser is performing validation, expanding
	 * external entities, and so forth.)
	 * </p>
	 * 
	 * <p>
	 * All XMLReaders are required to recognize the
	 * http://xml.org/sax/features/namespaces and the
	 * http://xml.org/sax/features/namespace-prefixes feature names.
	 * </p>
	 * 
	 * <p>
	 * Typical usage is something like this:
	 * </p>
	 * 
	 * <pre>
	 * XMLReader r = new MySAXDriver();
	 * 
	 * // try to activate validation
	 * try {
	 * 	r.setFeature(&quot;http://xml.org/sax/features/validation&quot;, true);
	 * } catch (SAXException e) {
	 * 	System.err.println(&quot;Cannot activate validation.&quot;);
	 * }
	 * 
	 * // register event handlers
	 * r.setContentHandler(new MyContentHandler());
	 * r.setErrorHandler(new MyErrorHandler());
	 * 
	 * // parse the first document
	 * try {
	 * 	r.parse(&quot;http://www.foo.com/mydoc.xml&quot;);
	 * } catch (IOException e) {
	 * 	System.err.println(&quot;I/O exception reading XML document&quot;);
	 * } catch (SAXException e) {
	 * 	System.err.println(&quot;XML exception reading document.&quot;);
	 * }
	 * </pre>
	 * 
	 * <p>
	 * Implementors are free (and encouraged) to invent their own features,
	 * using names built on their own URIs.
	 * </p>
	 * 
	 * @param name
	 *            The feature name, which is a fully-qualified URI.
	 * @return The current value of the feature (true or false).
	 * @exception org.xml.sax.SAXNotRecognizedException
	 *                If the feature value can't be assigned or retrieved.
	 * @exception org.xml.sax.SAXNotSupportedException
	 *                When the XMLReader recognizes the feature name but cannot
	 *                determine its value at this time.
	 * @see #setFeature
	 */
	public boolean getFeature(String name) throws SAXNotRecognizedException, SAXNotSupportedException {
		boolean ret;

		if (name.equals("http://xml.org/sax/features/namespaces"))
			ret = mNameSpaces;
		else if (name.equals("http://xml.org/sax/features/namespace-prefixes"))
			ret = mNameSpacePrefixes;
		else
			throw new SAXNotSupportedException(name + " not yet understood");

		return (ret);
	}

	/**
	 * Set the value of a feature flag.
	 * 
	 * <p>
	 * The feature name is any fully-qualified URI. It is possible for an
	 * XMLReader to expose a feature value but to be unable to change the
	 * current value. Some feature values may be immutable or mutable only in
	 * specific contexts, such as before, during, or after a parse.
	 * </p>
	 * 
	 * <p>
	 * All XMLReaders are required to support setting
	 * http://xml.org/sax/features/namespaces to true and
	 * http://xml.org/sax/features/namespace-prefixes to false.
	 * </p>
	 * 
	 * @param name
	 *            The feature name, which is a fully-qualified URI.
	 * @param value
	 *            The requested value of the feature (true or false).
	 * @exception org.xml.sax.SAXNotRecognizedException
	 *                If the feature value can't be assigned or retrieved.
	 * @exception org.xml.sax.SAXNotSupportedException
	 *                When the XMLReader recognizes the feature name but cannot
	 *                set the requested value.
	 * @see #getFeature
	 */
	public void setFeature(String name, boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
		if (name.equals("http://xml.org/sax/features/namespaces"))
			mNameSpaces = value;
		else if (name.equals("http://xml.org/sax/features/namespace-prefixes"))
			mNameSpacePrefixes = value;
		else
			throw new SAXNotSupportedException(name + " not yet understood");
	}

	/**
	 * Look up the value of a property.
	 * 
	 * <p>
	 * The property name is any fully-qualified URI. It is possible for an
	 * XMLReader to recognize a property name but temporarily be unable to
	 * return its value. Some property values may be available only in specific
	 * contexts, such as before, during, or after a parse.
	 * </p>
	 * 
	 * <p>
	 * XMLReaders are not required to recognize any specific property names,
	 * though an initial core set is documented for SAX2.
	 * </p>
	 * 
	 * <p>
	 * Implementors are free (and encouraged) to invent their own properties,
	 * using names built on their own URIs.
	 * </p>
	 * 
	 * @param name
	 *            The property name, which is a fully-qualified URI.
	 * @return The current value of the property.
	 * @exception org.xml.sax.SAXNotRecognizedException
	 *                If the property value can't be assigned or retrieved.
	 * @exception org.xml.sax.SAXNotSupportedException
	 *                When the XMLReader recognizes the property name but cannot
	 *                determine its value at this time.
	 * @see #setProperty
	 */
	public Object getProperty(String name) throws SAXNotRecognizedException, SAXNotSupportedException {
		throw new SAXNotSupportedException(name + " not yet understood");
	}

	/**
	 * Set the value of a property.
	 * 
	 * <p>
	 * The property name is any fully-qualified URI. It is possible for an
	 * XMLReader to recognize a property name but to be unable to change the
	 * current value. Some property values may be immutable or mutable only in
	 * specific contexts, such as before, during, or after a parse.
	 * </p>
	 * 
	 * <p>
	 * XMLReaders are not required to recognize setting any specific property
	 * names, though a core set is defined by SAX2.
	 * </p>
	 * 
	 * <p>
	 * This method is also the standard mechanism for setting extended handlers.
	 * </p>
	 * 
	 * @param name
	 *            The property name, which is a fully-qualified URI.
	 * @param value
	 *            The requested value for the property.
	 * @exception org.xml.sax.SAXNotRecognizedException
	 *                If the property value can't be assigned or retrieved.
	 * @exception org.xml.sax.SAXNotSupportedException
	 *                When the XMLReader recognizes the property name but cannot
	 *                set the requested value.
	 */
	public void setProperty(String name, Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
		throw new SAXNotSupportedException(name + " not yet understood");
	}

	// //////////////////////////////////////////////////////////////////
	// Event handlers.
	// //////////////////////////////////////////////////////////////////

	/**
	 * Allow an application to register an entity resolver.
	 * 
	 * <p>
	 * If the application does not register an entity resolver, the XMLReader
	 * will perform its own default resolution.
	 * </p>
	 * 
	 * <p>
	 * Applications may register a new or different resolver in the middle of a
	 * parse, and the SAX parser must begin using the new resolver immediately.
	 * </p>
	 * 
	 * @param resolver
	 *            The entity resolver.
	 * @see #getEntityResolver
	 */
	public void setEntityResolver(EntityResolver resolver) {
		mEntityResolver = resolver;
	}

	/**
	 * Return the current entity resolver.
	 * 
	 * @return The current entity resolver, or null if none has been registered.
	 * @see #setEntityResolver
	 */
	public EntityResolver getEntityResolver() {
		return (mEntityResolver);
	}

	/**
	 * Allow an application to register a DTD event handler.
	 * 
	 * <p>
	 * If the application does not register a DTD handler, all DTD events
	 * reported by the SAX parser will be silently ignored.
	 * </p>
	 * 
	 * <p>
	 * Applications may register a new or different handler in the middle of a
	 * parse, and the SAX parser must begin using the new handler immediately.
	 * </p>
	 * 
	 * @param handler
	 *            The DTD handler.
	 * @see #getDTDHandler
	 */
	public void setDTDHandler(DTDHandler handler) {
		mDTDHandler = handler;
	}

	/**
	 * Return the current DTD handler.
	 * 
	 * @return The current DTD handler, or null if none has been registered.
	 * @see #setDTDHandler
	 */
	public DTDHandler getDTDHandler() {
		return (mDTDHandler);
	}

	/**
	 * Allow an application to register a content event handler.
	 * 
	 * <p>
	 * If the application does not register a content handler, all content
	 * events reported by the SAX parser will be silently ignored.
	 * </p>
	 * 
	 * <p>
	 * Applications may register a new or different handler in the middle of a
	 * parse, and the SAX parser must begin using the new handler immediately.
	 * </p>
	 * 
	 * @param handler
	 *            The content handler.
	 * @see #getContentHandler
	 */
	public void setContentHandler(ContentHandler handler) {
		mContentHandler = handler;
	}

	/**
	 * Return the current content handler.
	 * 
	 * @return The current content handler, or null if none has been registered.
	 * @see #setContentHandler
	 */
	public ContentHandler getContentHandler() {
		return (mContentHandler);
	}

	/**
	 * Allow an application to register an error event handler.
	 * 
	 * <p>
	 * If the application does not register an error handler, all error events
	 * reported by the SAX parser will be silently ignored; however, normal
	 * processing may not continue. It is highly recommended that all SAX
	 * applications implement an error handler to avoid unexpected bugs.
	 * </p>
	 * 
	 * <p>
	 * Applications may register a new or different handler in the middle of a
	 * parse, and the SAX parser must begin using the new handler immediately.
	 * </p>
	 * 
	 * @param handler
	 *            The error handler.
	 * @see #getErrorHandler
	 */
	public void setErrorHandler(ErrorHandler handler) {
		mErrorHandler = handler;
	}

	/**
	 * Return the current error handler.
	 * 
	 * @return The current error handler, or null if none has been registered.
	 * @see #setErrorHandler
	 */
	public ErrorHandler getErrorHandler() {
		return (mErrorHandler);
	}

	// //////////////////////////////////////////////////////////////////
	// Parsing.
	// //////////////////////////////////////////////////////////////////

	/**
	 * Parse an XML document.
	 * 
	 * <p>
	 * The application can use this method to instruct the XML reader to begin
	 * parsing an XML document from any valid input source (a character stream,
	 * a byte stream, or a URI).
	 * </p>
	 * 
	 * <p>
	 * Applications may not invoke this method while a parse is in progress
	 * (they should create a new XMLReader instead for each nested XML
	 * document). Once a parse is complete, an application may reuse the same
	 * XMLReader object, possibly with a different input source. Configuration
	 * of the XMLReader object (such as handler bindings and values established
	 * for feature flags and properties) is unchanged by completion of a parse,
	 * unless the definition of that aspect of the configuration explicitly
	 * specifies other behavior. (For example, feature flags or properties
	 * exposing characteristics of the document being parsed.)
	 * </p>
	 * 
	 * <p>
	 * During the parse, the XMLReader will provide information about the XML
	 * document through the registered event handlers.
	 * </p>
	 * 
	 * <p>
	 * This method is synchronous: it will not return until parsing has ended.
	 * If a client application wants to terminate parsing early, it should throw
	 * an exception.
	 * </p>
	 * 
	 * @param input
	 *            The input source for the top-level of the XML document.
	 * @exception org.xml.sax.SAXException
	 *                Any SAX exception, possibly wrapping another exception.
	 * @exception java.io.IOException
	 *                An IO exception from the parser, possibly from a byte
	 *                stream or character stream supplied by the application.
	 * @see org.xml.sax.InputSource
	 * @see #parse(java.lang.String)
	 * @see #setEntityResolver
	 * @see #setDTDHandler
	 * @see #setContentHandler
	 * @see #setErrorHandler
	 */
	public void parse(InputSource input) throws IOException, SAXException {
		Locator locator;
		ParserFeedback feedback;

		if (null != mContentHandler)
			try {
				mParser = new Parser(new Lexer(new Page(input.getByteStream(), input.getEncoding())));
				locator = new Locator(mParser);
				if (null != mErrorHandler)
					feedback = new Feedback(mErrorHandler, locator);
				else
					feedback = new DefaultParserFeedback(0);
				mParser.setFeedback(feedback);
				mContentHandler.setDocumentLocator(locator);
				try {
					mContentHandler.startDocument();
					for (NodeIterator iterator = mParser.elements(); iterator.hasMoreNodes(); doSAX(iterator.nextNode()))
						;
					mContentHandler.endDocument();
				} catch (SAXException se) {
					if (null != mErrorHandler)
						mErrorHandler.fatalError(new SAXParseException("contentHandler threw me", locator, se));
				}
			} catch (ParserException pe) {
				if (null != mErrorHandler)
					mErrorHandler.fatalError(new SAXParseException(pe.getMessage(), "", "", 0, 0));
			}
	}

	/**
	 * Parse an XML document from a system identifier (URI).
	 * 
	 * <p>
	 * This method is a shortcut for the common case of reading a document from
	 * a system identifier. It is the exact equivalent of the following:
	 * </p>
	 * 
	 * <pre>
	 * parse(new InputSource(systemId));
	 * </pre>
	 * 
	 * <p>
	 * If the system identifier is a URL, it must be fully resolved by the
	 * application before it is passed to the parser.
	 * </p>
	 * 
	 * @param systemId
	 *            The system identifier (URI).
	 * @exception org.xml.sax.SAXException
	 *                Any SAX exception, possibly wrapping another exception.
	 * @exception java.io.IOException
	 *                An IO exception from the parser, possibly from a byte
	 *                stream or character stream supplied by the application.
	 * @see #parse(org.xml.sax.InputSource)
	 */
	public void parse(String systemId) throws IOException, SAXException {
		Locator locator;
		ParserFeedback feedback;

		if (null != mContentHandler)
			try {
				mParser = new Parser(systemId);
				locator = new Locator(mParser);
				if (null != mErrorHandler)
					feedback = new Feedback(mErrorHandler, locator);
				else
					feedback = new DefaultParserFeedback(DefaultParserFeedback.QUIET);
				mParser.setFeedback(feedback);

				// OK, try a simplistic parse
				mContentHandler.setDocumentLocator(locator);
				try {
					mContentHandler.startDocument();
					for (NodeIterator iterator = mParser.elements(); iterator.hasMoreNodes();)
						doSAX(iterator.nextNode());
					mContentHandler.endDocument();
				} catch (SAXException se) {
					if (null != mErrorHandler)
						mErrorHandler.fatalError(new SAXParseException("contentHandler threw me", locator, se));
				}
			} catch (ParserException pe) {
				if (null != mErrorHandler)
					mErrorHandler.fatalError(new SAXParseException(pe.getMessage(), "", systemId, 0, 0));

			}
	}

	/**
	 * Process nodes recursively on the DocumentHandler. Calls methods on the
	 * handler based on the type and whether it's an end tag. Processes
	 * composite tags recursively. Does rudimentary namespace processing
	 * according to the state of {@link #mNameSpaces} and
	 * {@link #mNameSpacePrefixes}.
	 * 
	 * @param node
	 *            The htmlparser node to traverse.
	 * @exception ParserException
	 *                If a parse error occurs.
	 * @exception SAXException
	 *                If a SAX error occurs.
	 */
	protected void doSAX(Node node) throws ParserException, SAXException {
		Tag tag;
		Tag end;

		if (node instanceof Remark) {
			String text = mParser.getLexer().getPage().getText(node.getStartPosition(), node.getEndPosition());
			mContentHandler.ignorableWhitespace(text.toCharArray(), 0, text.length());
		} else if (node instanceof Text) {
			String text = mParser.getLexer().getPage().getText(node.getStartPosition(), node.getEndPosition());
			mContentHandler.characters(text.toCharArray(), 0, text.length());
		} else if (node instanceof Tag) {
			tag = (Tag) node;
			if (mNameSpaces)
				mSupport.processName(tag.getTagName(), mParts, false);
			else {
				mParts[0] = "";
				mParts[1] = "";
			}
			if (mNameSpacePrefixes)
				mParts[2] = tag.getTagName();
			else if (mNameSpaces)
				mParts[2] = "";
			else
				mParts[2] = tag.getTagName();

			mContentHandler.startElement(mParts[0], // uri
					mParts[1], // local
					mParts[2], // raw
					new Attributes(tag, mSupport, mParts));
			NodeList children = tag.getChildren();
			if (null != children)
				for (int i = 0; i < children.size(); i++)
					doSAX(children.elementAt(i));
			end = tag.getEndTag();
			if (null != end) {
				if (mNameSpaces)
					mSupport.processName(end.getTagName(), mParts, false);
				else {
					mParts[0] = "";
					mParts[1] = "";
				}
				if (mNameSpacePrefixes)
					mParts[2] = end.getTagName();
				else if (mNameSpaces)
					mParts[2] = "";
				else
					mParts[2] = end.getTagName();
				mContentHandler.endElement(mParts[0], // uri
						mParts[1], // local
						mParts[2]); // raw
			}
		}
	}
}
